Experiment 2 : EDA on Housing.csv¶
InĀ [34]:
import pandas as pd
df = pd.read_csv("Housing.csv")
df.head()
Out[34]:
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7229300521 | 20141013T000000 | 231300.0 | 2 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | ... | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 |
| 1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | ... | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1690 | 7639 |
| 2 | 5631500400 | 20150225T000000 | 180000.0 | 2 | 1.00 | 770 | 10000 | 1.0 | 0 | 0 | ... | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.7379 | -122.233 | 2720 | 8062 |
| 3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | ... | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.5208 | -122.393 | 1360 | 5000 |
| 4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | ... | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.6168 | -122.045 | 1800 | 7503 |
5 rows Ć 21 columns
InĀ [35]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 21613 entries, 0 to 21612 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 21613 non-null int64 1 date 21613 non-null object 2 price 21613 non-null float64 3 bedrooms 21613 non-null int64 4 bathrooms 21613 non-null float64 5 sqft_living 21613 non-null int64 6 sqft_lot 21613 non-null int64 7 floors 21613 non-null float64 8 waterfront 21613 non-null int64 9 view 21613 non-null int64 10 condition 21613 non-null int64 11 grade 21613 non-null int64 12 sqft_above 21613 non-null int64 13 sqft_basement 21613 non-null int64 14 yr_built 21613 non-null int64 15 yr_renovated 21613 non-null int64 16 zipcode 21613 non-null int64 17 lat 21613 non-null float64 18 long 21613 non-null float64 19 sqft_living15 21613 non-null int64 20 sqft_lot15 21613 non-null int64 dtypes: float64(5), int64(15), object(1) memory usage: 3.5+ MB
InĀ [36]:
df.describe()
Out[36]:
| id | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.161300e+04 | 2.161300e+04 | 21613.000000 | 21613.000000 | 21613.000000 | 2.161300e+04 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 |
| mean | 4.580306e+09 | 5.400886e+05 | 3.370795 | 2.114757 | 2079.899736 | 1.510697e+04 | 1.494309 | 0.007542 | 0.234303 | 3.409430 | 7.656873 | 1788.390691 | 291.509045 | 1971.005136 | 84.402258 | 98077.939805 | 47.560053 | -122.213896 | 1986.552492 | 12768.455652 |
| std | 2.876570e+09 | 3.671268e+05 | 0.930105 | 0.770163 | 918.440897 | 4.142051e+04 | 0.539989 | 0.086517 | 0.766318 | 0.650743 | 1.175459 | 828.090978 | 442.575043 | 29.373411 | 401.679240 | 53.505026 | 0.138564 | 0.140828 | 685.391304 | 27304.179631 |
| min | 1.000102e+06 | 7.500000e+04 | 0.000000 | 0.000000 | 290.000000 | 5.200000e+02 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 290.000000 | 0.000000 | 1900.000000 | 0.000000 | 98001.000000 | 47.155900 | -122.519000 | 399.000000 | 651.000000 |
| 25% | 2.123049e+09 | 3.219500e+05 | 3.000000 | 1.750000 | 1427.000000 | 5.040000e+03 | 1.000000 | 0.000000 | 0.000000 | 3.000000 | 7.000000 | 1190.000000 | 0.000000 | 1951.000000 | 0.000000 | 98033.000000 | 47.471000 | -122.328000 | 1490.000000 | 5100.000000 |
| 50% | 3.904930e+09 | 4.500000e+05 | 3.000000 | 2.250000 | 1910.000000 | 7.618000e+03 | 1.500000 | 0.000000 | 0.000000 | 3.000000 | 7.000000 | 1560.000000 | 0.000000 | 1975.000000 | 0.000000 | 98065.000000 | 47.571800 | -122.230000 | 1840.000000 | 7620.000000 |
| 75% | 7.308900e+09 | 6.450000e+05 | 4.000000 | 2.500000 | 2550.000000 | 1.068800e+04 | 2.000000 | 0.000000 | 0.000000 | 4.000000 | 8.000000 | 2210.000000 | 560.000000 | 1997.000000 | 0.000000 | 98118.000000 | 47.678000 | -122.125000 | 2360.000000 | 10083.000000 |
| max | 9.900000e+09 | 7.700000e+06 | 33.000000 | 8.000000 | 13540.000000 | 1.651359e+06 | 3.500000 | 1.000000 | 4.000000 | 5.000000 | 13.000000 | 9410.000000 | 4820.000000 | 2015.000000 | 2015.000000 | 98199.000000 | 47.777600 | -121.315000 | 6210.000000 | 871200.000000 |
InĀ [37]:
df.shape
Out[37]:
(21613, 21)
InĀ [38]:
df.columns
Out[38]:
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
'lat', 'long', 'sqft_living15', 'sqft_lot15'],
dtype='object')
InĀ [39]:
df = df.drop('date',axis=1)
InĀ [40]:
df = df.drop('id',axis=1)
InĀ [42]:
df['bedrooms'].unique()
Out[42]:
array([ 2, 3, 4, 5, 1, 6, 7, 0, 8, 9, 11, 10, 33])
InĀ [8]:
df['view'].unique()
Out[8]:
array([0, 3, 4, 2, 1])
InĀ [9]:
df.isnull().sum()
Out[9]:
id 0 price 0 bedrooms 0 bathrooms 0 sqft_living 0 sqft_lot 0 floors 0 waterfront 0 view 0 condition 0 grade 0 sqft_above 0 sqft_basement 0 yr_built 0 yr_renovated 0 zipcode 0 lat 0 long 0 sqft_living15 0 sqft_lot15 0 dtype: int64
InĀ [10]:
df[df.duplicated()]
#no duplicate data
Out[10]:
| id | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3951 | 1825069031 | 550000.0 | 4 | 1.75 | 2410 | 8447 | 2.0 | 0 | 3 | 4 | 8 | 2060 | 350 | 1936 | 1980 | 98074 | 47.6499 | -122.088 | 2520 | 14789 |
| 14983 | 6308000010 | 585000.0 | 3 | 2.50 | 2290 | 5089 | 2.0 | 0 | 0 | 3 | 9 | 2290 | 0 | 2001 | 0 | 98006 | 47.5443 | -122.172 | 2290 | 7984 |
| 20054 | 8648900110 | 555000.0 | 3 | 2.50 | 1940 | 3211 | 2.0 | 0 | 0 | 3 | 8 | 1940 | 0 | 2009 | 0 | 98027 | 47.5644 | -122.093 | 1880 | 3078 |
InĀ [11]:
df.drop_duplicates(inplace=True)
InĀ [12]:
df.shape
Out[12]:
(21610, 20)
InĀ [13]:
df.corr()
Out[13]:
| id | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id | 1.000000 | -0.016769 | 0.001307 | 0.005091 | -0.012241 | -0.132101 | 0.018480 | -0.002717 | 0.011785 | -0.023687 | 0.008093 | -0.010859 | -0.005085 | 0.021217 | -0.016688 | -0.008126 | -0.001862 | 0.020776 | -0.002870 | -0.138778 |
| price | -0.016769 | 1.000000 | 0.308385 | 0.525142 | 0.702037 | 0.089663 | 0.256801 | 0.266371 | 0.397413 | 0.036366 | 0.667450 | 0.605570 | 0.323827 | 0.054009 | 0.126495 | -0.053195 | 0.307006 | 0.021622 | 0.585387 | 0.082449 |
| bedrooms | 0.001307 | 0.308385 | 1.000000 | 0.515983 | 0.576696 | 0.031709 | 0.175485 | -0.006578 | 0.079445 | 0.028452 | 0.357014 | 0.477633 | 0.303095 | 0.154296 | 0.018705 | -0.152807 | -0.008935 | 0.129483 | 0.391662 | 0.029244 |
| bathrooms | 0.005091 | 0.525142 | 0.515983 | 1.000000 | 0.754688 | 0.087750 | 0.500669 | 0.063747 | 0.187891 | -0.124940 | 0.664990 | 0.685351 | 0.283814 | 0.505997 | 0.050879 | -0.203831 | 0.024590 | 0.223047 | 0.568665 | 0.087191 |
| sqft_living | -0.012241 | 0.702037 | 0.576696 | 0.754688 | 1.000000 | 0.172830 | 0.353954 | 0.103820 | 0.284641 | -0.058768 | 0.762719 | 0.876600 | 0.435054 | 0.318099 | 0.055314 | -0.199435 | 0.052521 | 0.240221 | 0.756420 | 0.183285 |
| sqft_lot | -0.132101 | 0.089663 | 0.031709 | 0.087750 | 0.172830 | 1.000000 | -0.005171 | 0.021601 | 0.074753 | -0.008967 | 0.113644 | 0.183527 | 0.015271 | 0.053105 | 0.007677 | -0.129611 | -0.085680 | 0.229552 | 0.144620 | 0.718556 |
| floors | 0.018480 | 0.256801 | 0.175485 | 0.500669 | 0.353954 | -0.005171 | 1.000000 | 0.023711 | 0.029325 | -0.263779 | 0.458151 | 0.523874 | -0.245673 | 0.489347 | 0.006156 | -0.059025 | 0.049593 | 0.125343 | 0.279861 | -0.011250 |
| waterfront | -0.002717 | 0.266371 | -0.006578 | 0.063747 | 0.103820 | 0.021601 | 0.023711 | 1.000000 | 0.401993 | 0.016652 | 0.082785 | 0.072080 | 0.080585 | -0.026159 | 0.092950 | 0.030277 | -0.014272 | -0.041904 | 0.086469 | 0.030702 |
| view | 0.011785 | 0.397413 | 0.079445 | 0.187891 | 0.284641 | 0.074753 | 0.029325 | 0.401993 | 1.000000 | 0.045836 | 0.251378 | 0.167659 | 0.276996 | -0.053229 | 0.103208 | 0.084838 | 0.006049 | -0.078560 | 0.280404 | 0.072577 |
| condition | -0.023687 | 0.036366 | 0.028452 | -0.124940 | -0.058768 | -0.008967 | -0.263779 | 0.016652 | 0.045836 | 1.000000 | -0.144655 | -0.158212 | 0.174071 | -0.361347 | -0.060862 | 0.002962 | -0.014971 | -0.106512 | -0.092854 | -0.003424 |
| grade | 0.008093 | 0.667450 | 0.357014 | 0.664990 | 0.762719 | 0.113644 | 0.458151 | 0.082785 | 0.251378 | -0.144655 | 1.000000 | 0.755919 | 0.168443 | 0.446966 | 0.014372 | -0.184795 | 0.114086 | 0.198347 | 0.713208 | 0.119265 |
| sqft_above | -0.010859 | 0.605570 | 0.477633 | 0.685351 | 0.876600 | 0.183527 | 0.523874 | 0.072080 | 0.167659 | -0.158212 | 0.755919 | 1.000000 | -0.051923 | 0.423922 | 0.023233 | -0.261163 | -0.000823 | 0.343791 | 0.731870 | 0.194060 |
| sqft_basement | -0.005085 | 0.323827 | 0.303095 | 0.283814 | 0.435054 | 0.015271 | -0.245673 | 0.080585 | 0.276996 | 0.174071 | 0.168443 | -0.051923 | 1.000000 | -0.133062 | 0.071320 | 0.074781 | 0.110535 | -0.144743 | 0.200367 | 0.017260 |
| yr_built | 0.021217 | 0.054009 | 0.154296 | 0.505997 | 0.318099 | 0.053105 | 0.489347 | -0.026159 | -0.053229 | -0.361347 | 0.446966 | 0.423922 | -0.133062 | 1.000000 | -0.224729 | -0.346808 | -0.148099 | 0.409395 | 0.326298 | 0.070999 |
| yr_renovated | -0.016688 | 0.126495 | 0.018705 | 0.050879 | 0.055314 | 0.007677 | 0.006156 | 0.092950 | 0.103208 | -0.060862 | 0.014372 | 0.023233 | 0.071320 | -0.224729 | 1.000000 | 0.064388 | 0.029271 | -0.068594 | -0.002841 | 0.007837 |
| zipcode | -0.008126 | -0.053195 | -0.152807 | -0.203831 | -0.199435 | -0.129611 | -0.059025 | 0.030277 | 0.084838 | 0.002962 | -0.184795 | -0.261163 | 0.074781 | -0.346808 | 0.064388 | 1.000000 | 0.267064 | -0.564069 | -0.279033 | -0.147257 |
| lat | -0.001862 | 0.307006 | -0.008935 | 0.024590 | 0.052521 | -0.085680 | 0.049593 | -0.014272 | 0.006049 | -0.014971 | 0.114086 | -0.000823 | 0.110535 | -0.148099 | 0.029271 | 0.267064 | 1.000000 | -0.135545 | 0.048839 | -0.086423 |
| long | 0.020776 | 0.021622 | 0.129483 | 0.223047 | 0.240221 | 0.229552 | 0.125343 | -0.041904 | -0.078560 | -0.106512 | 0.198347 | 0.343791 | -0.144743 | 0.409395 | -0.068594 | -0.564069 | -0.135545 | 1.000000 | 0.334592 | 0.254475 |
| sqft_living15 | -0.002870 | 0.585387 | 0.391662 | 0.568665 | 0.756420 | 0.144620 | 0.279861 | 0.086469 | 0.280404 | -0.092854 | 0.713208 | 0.731870 | 0.200367 | 0.326298 | -0.002841 | -0.279033 | 0.048839 | 0.334592 | 1.000000 | 0.183194 |
| sqft_lot15 | -0.138778 | 0.082449 | 0.029244 | 0.087191 | 0.183285 | 0.718556 | -0.011250 | 0.030702 | 0.072577 | -0.003424 | 0.119265 | 0.194060 | 0.017260 | 0.070999 | 0.007837 | -0.147257 | -0.086423 | 0.254475 | 0.183194 | 1.000000 |
InĀ [45]:
# Univariate Analysis : analyze distribution of target variable(price)
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure()
sns.histplot(df['price'],kde=True)
plt.title("Price Distribution")
plt.show()
InĀ [67]:
# outlier detection : Checking extreme values in price
plt.figure()
sns.boxplot(x=df['price'])
plt.title("Price boxplot")
plt.show()
InĀ [53]:
# Correlation matrix : checking relationship between features.
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(17,12))
sns.heatmap(df.corr(),annot=True,cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()
InĀ [54]:
# Feature vs price Analysis
plt.figure()
sns.scatterplot(x='sqft_living',y='price',data=df)
plt.show()
InĀ [15]:
#conclusion- It is an imbalanced dataset
df.condition.value_counts().plot(kind='bar')
plt.xlabel("grade")## Visualization
plt.ylabel("Count")
plt.show()
InĀ [56]:
df.columns
Out[56]:
Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
'waterfront', 'view', 'condition', 'grade', 'sqft_above',
'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
'sqft_living15', 'sqft_lot15'],
dtype='object')
InĀ [58]:
sns.histplot(df['sqft_living'],kde=True)
Out[58]:
<Axes: xlabel='sqft_living', ylabel='Count'>
InĀ [59]:
# univariate, bivariate, multivariate analysis
sns.pairplot(df)
Out[59]:
<seaborn.axisgrid.PairGrid at 0x1c2b4b06270>
InĀ [65]:
sns.catplot(x='bedrooms',y='price',data=df,kind="box")
Out[65]:
<seaborn.axisgrid.FacetGrid at 0x1c2fb14a710>